Analysis of Online News Popularity Dataset (https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity): explore the statistical summaries of the features, visualize the attributes, and make conclusions from the visualizations and analysis
This Online News Popularity dataset was acquired from Mashable (https://mashable.com) on 01/08/2015; the goal is to predict the number of shares in social networks (popularity).
Attribute Information:
Describe the meaning and type of data (scale, values, etc.) for each attribute in the data file.
Number of Attributes: 61 (58 predictive attributes, 2 non-predictive, 1 goal field)
url: URL of the article (non-predictive)
Can use Manisha's code for presentation
# Import libraries which will be uses for Lab_01 project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the raw CSV export of the dataset.
csv_path = '/Users/shanqinggu/Desktop/OnlineNewsPopularity.csv'
df = pd.read_csv(csv_path)
df.info()
# Drop the two non-predictive leading columns (url, timedelta) by keeping
# everything from ' n_tokens_title' onward. NOTE: the CSV's column names
# carry a leading space, hence ' n_tokens_title'.
df = df.loc[:, ' n_tokens_title':]
df.head()  # df.tail() shows the last rows instead
# Fold the six one-hot data_channel_is_* indicator columns into a single
# categorical ' Channel' column. Each subset is taken with .copy() so the
# label assignment writes to an independent frame instead of a view of df —
# this fixes the SettingWithCopyWarning the original chained assignment
# produced. Rows belonging to no channel are dropped by the concat, and the
# resulting frame is ordered by channel group, exactly as before.
channel_labels = {
    ' data_channel_is_lifestyle': 'Lifestyle',
    ' data_channel_is_entertainment': 'Entertainment',
    ' data_channel_is_bus': 'Bus',
    ' data_channel_is_socmed': 'Socmedia',
    ' data_channel_is_tech': 'Tech',
    ' data_channel_is_world': 'World',
}
channel_frames = []
for flag_col, label in channel_labels.items():
    subset = df[df[flag_col] == 1].copy()
    subset[' Channel'] = label
    channel_frames.append(subset)
df = pd.concat(channel_frames, axis=0)
# Fold the seven one-hot weekday_is_* indicator columns into a single
# categorical ' weekday' column. As with ' Channel', each subset is copied
# before the label assignment to avoid pandas' SettingWithCopyWarning on
# writes into a view. The concat order (Monday..Sunday) matches the original.
weekday_labels = {
    ' weekday_is_monday': 'Monday',
    ' weekday_is_tuesday': 'Tuesday',
    ' weekday_is_wednesday': 'Wednesday',
    ' weekday_is_thursday': 'Thursday',
    ' weekday_is_friday': 'Friday',
    ' weekday_is_saturday': 'Saturday',
    ' weekday_is_sunday': 'Sunday',
}
weekday_frames = []
for flag_col, label in weekday_labels.items():
    subset = df[df[flag_col] == 1].copy()
    subset[' weekday'] = label
    weekday_frames.append(subset)
df = pd.concat(weekday_frames, axis=0)
# Drop the one-hot indicator columns now that they are folded into the
# categorical ' Channel' and ' weekday' columns. Selecting by name replaces
# the original hard-coded positional drop (columns 11-16 and 29-36), which
# would break silently if the column order ever changed. The name patterns
# cover exactly the same columns: six ' data_channel_is_*', seven
# ' weekday_is_*', and ' is_weekend'.
indicator_cols = [c for c in df.columns
                  if c.startswith(' data_channel_is_')
                  or c.startswith(' weekday_is_')
                  or c == ' is_weekend']
df.drop(columns=indicator_cols, inplace=True)
# Data-quality checks.
# Missing values: every column reports zero nulls.
df.isnull().sum()
# Duplicates: keep=False marks every member of a duplicate group; the
# result is empty, so there are no fully duplicated rows.
df[df.duplicated(keep=False)]
# With >30k samples, outliers are tamed by a log transform rather than
# removed. Flag the heavily right-skewed variables: those whose maximum is
# more than 10x the median (and above 1). Variables that also take negative
# values need a larger shift before the log ("log+2").
df_T = df.describe().T
df_T["log"] = (df_T["max"] > 10 * df_T["50%"]) & (df_T["max"] > 1)
df_T["log+2"] = df_T["log"] & (df_T["min"] < 0)
# Collapse the two flags into a single label column for easy reading:
# "log+2" wins over "log"; unflagged rows stay blank.
df_T["scale"] = np.where(df_T["log+2"], "log+2",
                         np.where(df_T["log"], "log", ""))
df_T[["mean", "min", "50%", "max", "scale"]]
# Log-transform the 18 right-skewed variables flagged above, replacing 18
# near-identical statements with one data-driven loop. Each value is shifted
# before the log so zeros stay in log's domain; the two kw_* columns flagged
# "log+2" (they contain negative values) are shifted by 2 instead of 0.1.
# The tuple order below matches the original statement order exactly, so the
# new log_* columns are appended to df in the same positions as before.
log_specs = [
    (' n_tokens_content', 0.1), (' n_unique_tokens', 0.1),
    (' n_non_stop_words', 0.1), (' n_non_stop_unique_tokens', 0.1),
    (' num_hrefs', 0.1), (' num_self_hrefs', 0.1),
    (' num_imgs', 0.1), (' num_videos', 0.1),
    (' kw_min_min', 2), (' kw_max_min', 0.1), (' kw_avg_min', 2),
    (' kw_min_max', 0.1), (' kw_max_avg', 0.1), (' kw_avg_avg', 0.1),
    (' self_reference_min_shares', 0.1), (' self_reference_max_shares', 0.1),
    (' self_reference_avg_sharess', 0.1), (' shares', 0.1),
]
for col, shift in log_specs:
    # e.g. ' n_tokens_content' -> 'log_n_tokens_content' (leading space stripped)
    df['log_' + col.strip()] = np.log(df[col] + shift)
# Drop the raw (untransformed) counterparts of the 18 log_* columns,
# selecting by name instead of the original hard-coded positions
# [1-8, 11-14, 18-22, 44]. With the standard UCI column order those
# positions are exactly the columns listed below; name-based dropping
# keeps working even if the order shifts.
raw_logged = [
    ' n_tokens_content', ' n_unique_tokens', ' n_non_stop_words',
    ' n_non_stop_unique_tokens', ' num_hrefs', ' num_self_hrefs',
    ' num_imgs', ' num_videos', ' kw_min_min', ' kw_max_min',
    ' kw_avg_min', ' kw_min_max', ' kw_max_avg', ' kw_avg_avg',
    ' self_reference_min_shares', ' self_reference_max_shares',
    ' self_reference_avg_sharess', ' shares',
]
df.drop(columns=raw_logged, inplace=True)
# Sanity check: inspect the remaining columns and their dtypes.
df.dtypes
# Exploratory OLS of log_shares on every remaining numeric predictor.
# (This belongs in section 9; it is shown here to motivate the variable
# selection that follows.)
import statsmodels.api as sm

class_y = df.log_shares
# Predictors: everything except the target and the two categorical columns.
class_X = df.drop(['log_shares', ' Channel', ' weekday'], axis=1)
class_X = sm.add_constant(class_X)  # add the intercept term
ls_model = sm.OLS(class_y.astype(float), class_X.astype(float)).fit()
ls_model.summary()
NOTE: the full model shows strong multicollinearity.
Choose 20 variables from the summary above and remove the problematic ones based on the pairplot (not shown):
average_token_length num_keywords kw_max_max kw_min_avg global_subjectivity title_sentiment_polarity abs_title_subjectivity log_n_tokens_content log_n_unique_tokens log_num_hrefs log_num_self_hrefs log_num_imgs log_num_videos log_kw_max_min log_kw_avg_min log_kw_min_max log_kw_avg_avg log_self_reference_min_shares log_self_reference_max_shares log_self_reference_avg_sharess
# After inspecting the pairplot, keep 12 predictors plus the target
# 'log_shares'.
keep_cols = [
    ' average_token_length', ' num_keywords', ' global_subjectivity',
    ' title_sentiment_polarity', ' abs_title_subjectivity',
    'log_n_tokens_content', 'log_n_unique_tokens', 'log_num_hrefs',
    'log_num_self_hrefs', 'log_num_imgs', 'log_num_videos',
    'log_self_reference_avg_sharess', 'log_shares',
]
df_clean = df[keep_cols]
# Re-fit OLS on the reduced feature set to confirm the collinearity is gone.
import statsmodels.api as sm

clean_y = df_clean.log_shares
clean_X = df_clean.drop(columns=['log_shares'])
clean_X = sm.add_constant(clean_X)  # add the intercept term
clean_ls_model = sm.OLS(clean_y.astype(float), clean_X.astype(float)).fit()
clean_ls_model.summary()
NOTE: no multicollinearity remains in the reduced model.
# Quick statistical summary of every remaining column.
df.describe().T
# Histograms of the 12 selected predictors plus the target.
df_clean.hist(figsize=(12, 12))
# TODO: decide whether to include the pairplot/scatterplot for all the variables, or just mention that we produced it.
# Pairplot of the selected variables, grouped (hue) by Channel.
# seaborn renamed pairplot's `size` parameter to `height` in 0.9 and later
# removed `size` entirely, so `height=` is used here.
pairplot_vars = [' average_token_length', ' num_keywords',
                 ' global_subjectivity', ' title_sentiment_polarity',
                 ' abs_title_subjectivity', 'log_n_tokens_content',
                 'log_n_unique_tokens', 'log_num_hrefs', 'log_num_self_hrefs',
                 'log_num_imgs', 'log_num_videos',
                 'log_self_reference_avg_sharess']
sns.pairplot(df, vars=pairplot_vars, hue=" Channel", palette="husl", height=3)
# Pairplot of the selected variables, grouped (hue) by weekday.
# `height=` replaces the deprecated/removed `size=` parameter (seaborn 0.9+).
pairplot_vars = [' average_token_length', ' num_keywords',
                 ' global_subjectivity', ' title_sentiment_polarity',
                 ' abs_title_subjectivity', 'log_n_tokens_content',
                 'log_n_unique_tokens', 'log_num_hrefs', 'log_num_self_hrefs',
                 'log_num_imgs', 'log_num_videos',
                 'log_self_reference_avg_sharess']
sns.pairplot(df, vars=pairplot_vars, hue=" weekday", palette="husl", height=2)
# Names of the numeric columns (float64/int64); this excludes the string
# columns ' Channel' and ' weekday'. Iterating dtypes.items() avoids the
# deprecated positional Series indexing of the original.
numeric = [c for c, t in df.dtypes.items() if t in (np.float64, np.int64)]
len(numeric)
# Clustered correlation heatmap of the numeric columns.
cmap = sns.diverging_palette(255, 133, l=60, n=7, as_cmap=True, center="dark")
sns.clustermap(df[numeric].corr(), figsize=(14, 14), cmap=cmap);
# Plain correlation heatmap. Restrict to the numeric columns explicitly:
# df.corr() on a frame containing string columns raises a TypeError in
# pandas >= 2.0 (older pandas silently dropped them, giving this result).
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 10))  # plt.subplots returns (fig, ax)
sns.heatmap(df[numeric].corr(), cmap="BuPu", ax=ax)
# Binarize the target: a median (2-quantile) split of log_shares into
# 'unpopular' / 'popular'. df_cut keeps the alias the original created.
df['log_shares_cut'] = df_cut = pd.qcut(df['log_shares'], 2,
                                        labels=('unpopular', 'popular'))
# Drop the continuous target by name. The original dropped df.columns[46]
# after looking up get_loc('log_shares') — but never used the lookup, so the
# hard-coded position would break if the column order ever changed.
df.drop(columns=['log_shares'], inplace=True)
# Pairplot of the selected variables, grouped (hue) by the binary popularity
# class. `height=` replaces the deprecated/removed `size=` (seaborn 0.9+).
pairplot_vars = [' average_token_length', ' num_keywords',
                 ' global_subjectivity', ' title_sentiment_polarity',
                 ' abs_title_subjectivity', 'log_n_tokens_content',
                 'log_n_unique_tokens', 'log_num_hrefs', 'log_num_self_hrefs',
                 'log_num_imgs', 'log_num_videos',
                 'log_self_reference_avg_sharess']
sns.pairplot(df, vars=pairplot_vars, hue="log_shares_cut", palette="husl",
             height=2)
# Standardize the 12 selected features ahead of PCA (PCA is scale-sensitive).
from sklearn.preprocessing import StandardScaler

features = [' average_token_length', ' num_keywords', ' global_subjectivity',
            ' title_sentiment_polarity', ' abs_title_subjectivity',
            'log_n_tokens_content', 'log_n_unique_tokens', 'log_num_hrefs',
            'log_num_self_hrefs', 'log_num_imgs', 'log_num_videos',
            'log_self_reference_avg_sharess']
x = df.loc[:, features].values            # feature matrix
y = df.loc[:, ['log_shares_cut']].values  # target labels
x = StandardScaler().fit_transform(x)     # zero mean, unit variance per column
# Project the standardized features onto the first two principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'])
# BUG FIX: pd.concat(axis=1) aligns rows by index label. principalDf has a
# fresh RangeIndex (0..n-1) while df still carries the original CSV row
# labels, reordered by the earlier channel/weekday concats — so the original
# concat misaligned rows and produced NaNs. Resetting df's index makes the
# labels line up row-for-row with the PCA output.
finalDf = pd.concat(
    [principalDf, df[['log_shares_cut']].reset_index(drop=True)], axis=1)
finalDf.head(10)
# Scatter the two principal components, one color per popularity class.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
# (label, color) pairs: unpopular in red, popular in blue.
class_colors = [('unpopular', 'r'), ('popular', 'b')]
for label, color in class_colors:
    mask = finalDf['log_shares_cut'] == label
    ax.scatter(finalDf.loc[mask, 'principal component 1'],
               finalDf.loc[mask, 'principal component 2'],
               c=color, s=15)
ax.legend([label for label, _ in class_colors])
ax.grid()